*Laura Derksen 27 March 2025
*This file prepares data for analysis
*Requires raw data as input


* Session settings and working directory
clear all
set more off
set matsize 11000
set cformat %5.3f

* Try each collaborator's replication folder in turn. Folders that do not exist
* on this machine are skipped silently (cap), and the last one found is used.
foreach root in ///
	"C:\Users\laura\Dropbox (Frischsenteret)\Research Projects\CGC 2017\EMRI\REPLICATION" ///
	"C:\Users\laurader\Dropbox (Frischsenteret)\Research Projects\CGC 2017\EMRI\REPLICATION" ///
	"C:\Users\l.pongeluppe16\Dropbox\Doutorado_PhD_UofT_ROTMAN\Cursos\06 - RA_Anita McGahan\2018_02_15_Malawi\2018_04_26_Health_Malawi\CGC 2017\EMRI\REPLICATION" {
	cap cd "`root'"
}


********************************************************************************************
*PART 0: Generating dta files for later merge
********************************************************************************************

* Convert each raw site/staff CSV into a Stata dataset with the same base name
foreach tbl in SiteDataCollection SiteDetails Staff {
	insheet using "Data/Raw/`tbl'.csv", clear comma
	save "Data/Raw/`tbl'.dta", replace
}

********************************************************************************************
*PART 1: Generating cleaner EMR database Data/Prep/FullData.dta
********************************************************************************************
*Output: one row per retained ART initiation (ART==1) or ART visit (ART==2), with VisitSite
*overwritten by the patient's initiation site and restricted to sites listed in SiteDataCollection

*This file contains the raw data extracted directly from the EMR system
insheet using Data/Raw/RawEMRData.csv, clear comma

*Remove "visits" where no ART was dispensed
drop if ARVDispensedDays==. & ART==2

*NOTE(review): this sort order is replaced by the gsort on PatientId ART VisitDate below
*before any by-group command runs
gsort PatientId DeathDate

*Keep patient initiations and patient visits (drop pre-ART patients)
keep if ART==1 | ART==2 

*Fix dates
*dofC() converts a datetime value to a daily date
*(assumes the three variables were read in as numeric datetimes -- TODO confirm against the raw CSV)
foreach x of varlist VisitDate DeathDate BirthDate {
replace `x'=dofC(`x')
format `x' %td
}
 
*Include only patients who we observe initiating ART 
*MinART==1 means the patient has at least one initiation record in the data
gen sample_1=1
gsort PatientId ART VisitDate
by PatientId: egen MinART=min(ART)
drop if MinART!=1
*99.5% of visits are for patients who we observe initiating ART

*For remaining analysis use ITT: assign patient to the clinic they initiated at 
*We include only patients whose initiations we observe, and where initiation site is clear
*VisitSiteRaw keeps the site actually recorded on each row
gen VisitSiteRaw = VisitSite
*Blank out the site on non-initiation rows, then convert to numeric (force turns non-numeric text into missing)
replace VisitSite="" if ART!=1
destring VisitSite, force replace
*Rows are sorted with ART==1 first within patient, so the initiation site is copied down onto later visits
*(carryforward is a user-written command and must be installed)
by PatientId: carryforward VisitSite, replace
drop if VisitSite==.
*VisitSite missing for 4% of initiations


*Drop visits where visit site does not match list of sites from which raw data was collected 
merge m:1 VisitSite using Data/Raw/SiteDataCollection
keep if _merge==3
drop _merge

save Data/Prep/FullData, replace


********************************************************************************************
*PART 2: Determining dates of EMR adoption by clinic Data/Prep/EMRDates.dta
********************************************************************************************
*Output: one row per VisitSite with EMRDate (daily date of EMR adoption) and clinicnum
*(row number after collapsing by site)

*Upload the clean EMR database 
use Data/Prep/FullData, clear

*Keeping only initiations
keep if ART==1

gsort VisitSite VisitDate
order VisitSite VisitDate EntryDelay

*What period do we have data for?
by VisitSite: egen MinInitDate=min(VisitDate)
by VisitSite: egen MaxInitDate=max(VisitDate)
format MinInitDate %td
format MaxInitDate %td

*Key variable is EntryDelay -- it is generated by the EMR system  
*Indicates whether the record was entered in real time (0) or delayed (e.g. 5.67 days after the encounter)
*We will use the first moment of real time data entry -- call this EMRDate0
gen rt=VisitDate if EntryDelay==0
by VisitSite: egen EMRDate0=min(rt)
by VisitSite: egen MinEntryDelay=min(EntryDelay)
drop rt
format EMRDate0 %td
*For some clinics that only adopted in 2019, we don't see real time entry yet within our dataset
*Fallback: last observed initiation date plus the smallest entry delay seen at that clinic
replace EMRDate0=MaxInitDate+MinEntryDelay if EMRDate0==.
drop MinEntryDelay

*There are situations where this method does not work
*Early HIV clinics have strange EntryDelay values, often integers, often zero, but it is clear that these were not using the EMR system
*After discussing with Baobab, determined that these early adopters entered paper based records into a different computer system in the beginning (not the EMR system we study here, "J Machines"), and this data was then added to the EMR data at the time of backentry
*The filter refers to EMRDate0 explicitly: EMRDate itself is only created further down, so the
*previous spelling year(EMRDate) worked only through variable-name abbreviation
tab EMRDate0 if year(EMRDate0)<2008
gsort EMRDate0 VisitSite VisitDate
order VisitSite EMRDate0 EntryDelay VisitDate
*Opened data browser to see the pattern of EntryDelay for early vs late adopters 
*Can see when data starts to be back-entered directly on the EMR system because there are decimal points
*EMRMachine==1 flags records whose EntryDelay is not a whole number
gen RoundEntryDelay=round(EntryDelay)
gen EMRMachine=(RoundEntryDelay!=EntryDelay)
*Date the record was actually keyed in = visit date + delay; the earliest such date per site is AltEMRDate
gen RealTimeEntryDate=VisitDate+EntryDelay if EMRMachine==1
format RealTimeEntryDate %td
bysort VisitSite: egen AltEMRDate=min(RealTimeEntryDate)
format AltEMRDate %td

*Compare the two ways of determining adoption dates
order EMRDate0 AltEMRDate
gsort EMRDate0 AltEMRDate VisitSite VisitDate
gen DiffEMRDate=EMRDate0-AltEMRDate
order DiffEMRDate EMRDate0 AltEMRDate
gsort -DiffEMRDate VisitSite VisitDate

*Cleaning entry dates by hand for those with different values
*The above methods are sensitive to a single error at the clinic, so some clinics need to be assessed by hand
*The moment of EMR adoption is obvious for these cases by looking at the pattern of descending EntryDelay values
*Default is AltEMRDate; the sites listed below use EMRDate0 instead
gen EMRDate=AltEMRDate
replace EMRDate=EMRDate0 if inlist(VisitSite, 670, 345, 289, 665, 376, 611, 610)
*AltEMRDate can carry a fractional day (from EntryDelay); keep the whole day only
replace EMRDate=floor(EMRDate)

*EMRDate is constant within site, so this leaves one row per clinic
collapse (firstnm) EMRDate, by(VisitSite)

format EMRDate %td

gen clinicnum=_n

gsort VisitSite

save Data/Prep/EMRDates, replace

********************************************************************************************
*PART 3: Identifying the clinic-year level sample for analysis Data/Prep/Sample_ClinicYears.dta
********************************************************************************************

use Data/Prep/FullData, clear

*Attach each clinic's EMR adoption date
merge m:1 VisitSite using Data/Prep/EMRDates, nogenerate

gsort VisitSite VisitDate

*Event time: YearsPostEMR=0 is the year immediately following the EMR adoption date
gen DaysPostEMR = VisitDate - EMRDate
gen YearsPostEMR = ceil(DaysPostEMR/365) - 1

*Initiation and refill-visit indicators
gen Init = (ART==1)
gen RefVisit = (ART==2)

*Span of event-years observed at each clinic
bysort VisitSite: egen MinPost = min(YearsPostEMR)
bysort VisitSite: egen MaxPost = max(YearsPostEMR)

order MinPost EMRDate

*Clinic-by-event-year counts, used to check which clinics completed data back entry
gcollapse (sum) RefVisit Init (max) MinPost MaxPost, by(YearsPostEMR EMRDate VisitSite)

gsort VisitSite YearsPostEMR

*The first and last event-year of each clinic are incomplete, so they are removed
drop if YearsPostEMR==MinPost | YearsPostEMR==MaxPost

*Ratio of this year's initiations to next year's. Growth should be gradual if all initiations were back-entered.
by VisitSite: gen IncInit = Init/Init[_n+1]
summ IncInit if YearsPostEMR>=0
*Biggest jump in post period is 0.377 ratio
*Flag very large jumps -- these are likely missing back entries (a missing ratio is never flagged)
gen Flag = (IncInit<0.37)

*Discard every clinic-year at or before the latest flagged year (eg years with just a few observations)
gen FlagPost = YearsPostEMR if Flag==1
by VisitSite: egen MaxFlagPost = max(FlagPost)
drop if YearsPostEMR<=MaxFlagPost & MaxFlagPost!=.

drop *Flag* IncInit

*Earliest remaining event-year and approximate calendar year per clinic
gen Datey = year(EMRDate) + YearsPostEMR
by VisitSite: egen BackEntryLimit = min(YearsPostEMR)
by VisitSite: egen BackEntryYear = min(Datey)

*Keep clinics with at least one pre-EMR year whose data go back to at least 2013
keep if BackEntryLimit<=-1 & BackEntryYear<=2013

keep VisitSite YearsPostEMR MinPost MaxPost

sort VisitSite YearsPostEMR

save Data/Prep/Sample_ClinicYears, replace

********************************************************************************************
*PART 4: Calculating the number of deaths at the clinic-year and clinic-quarter levels
*Data/Prep/Deaths_ClinicYears and Data/Prep/Deaths_ClinicQuarters
********************************************************************************************

use Data/Prep/FullData, clear

*Advanced HIV: stage 3 or 4, or a CD4 count strictly between 0 and 200. Missing values count as non-advanced.
gen AdvHIV = inlist(HIVStage, 3, 4)
replace AdvHIV = 1 if CD4Count>0 & CD4Count<200
tab AdvHIV

*One row per patient (the initiation record), restricted to patients with a recorded death
keep if ART==1 & DeathDate!=.
gen Death = (DeathDate!=.)

*Deaths split by sex
gen Male = (Sex==1)
gen DeathMale = (Death==1 & Male==1)
gen DeathFemale = (Death==1 & Male==0)

*Age at death in years, in four bands (missing when age is unknown or outside 0-95)
gen Age = round((DeathDate-BirthDate)/365)
replace Age = . if Age<0 | Age>95
gen Age_09 = (Age<=9) if Age!=.
gen Age_1017 = (Age>=10 & Age<=17) if Age!=.
gen Age_1849 = (Age>=18 & Age<=49) if Age!=.
gen Age_50plus = (Age>=50) if Age!=.

*Deaths split by age band
foreach band of varlist Age_* {
	gen Death`band' = Death*`band'
}

*Attach each clinic's EMR adoption date
merge m:1 VisitSite using Data/Prep/EMRDates, nogenerate

gsort VisitSite VisitDate

*Event time of the death: YearsPostEMR=0 is the year immediately following the EMR adoption date
gen DaysPostEMR = DeathDate - EMRDate
gen YearsPostEMR = ceil(DaysPostEMR/365) - 1

gcollapse (sum) Death DeathMale DeathFemale DeathAge_*, by(VisitSite YearsPostEMR)

gsort VisitSite YearsPostEMR

*Restrict to the clinic-years used for the main analysis; sample clinic-years with no deaths are kept
merge 1:1 VisitSite YearsPostEMR using Data/Prep/Sample_ClinicYears, keep(match using) nogenerate

*Clinic-years that had no death record get a count of zero
foreach cnt of varlist Death* {
	replace `cnt' = 0 if `cnt'==.
}

save Data/Prep/Deaths_ClinicYears, replace

***Deaths by clinic-quarter
use Data/Prep/FullData, clear

*One initiation row per patient, restricted to patients with a recorded death
keep if ART==1 & DeathDate!=.
gen Death = (DeathDate!=.)

*Attach each clinic's EMR adoption date
merge m:1 VisitSite using Data/Prep/EMRDates, nogenerate

gsort VisitSite VisitDate

*Event time of the death in quarters: QPostEMR=0 is the quarter immediately following the EMR adoption date
gen DaysPostEMR = DeathDate - EMRDate
gen QPostEMR = ceil(DaysPostEMR/91.25) - 1

gcollapse (sum) Death, by(VisitSite QPostEMR)

gsort VisitSite QPostEMR

*Any missing count becomes zero
foreach cnt of varlist Death* {
	replace `cnt' = 0 if `cnt'==.
}

save Data/Prep/Deaths_ClinicQuarters, replace

********************************************************************************************
*PART 5: Produce full individual-visit-level data for analysis
*Data/Analysis/Data_Individual and Data/Analysis/Data_IndividualInit
********************************************************************************************


*Encounter codes for referrals to non-HIV services
*The raw encounter file is written out as CSV and read straight back in
use "Data/Raw/RawEncounters.dta", clear
outsheet using "Data/Raw/RawEncounters.csv", replace comma
insheet using "Data/Raw/RawEncounters.csv", clear comma

*Remove every encounter type that is an HIV/ART service, leaving non-HIV referrals only
drop if encountertype<=10 ///
	| inrange(encountertype, 51, 54) ///
	| inlist(encountertype, 25, 39, 40, 66, 68, 80, 100, 104, 105, 119, 122, 123, 143)
tab encountertype

*Replace the string date with a Stata daily date of the same name
gen tmp_visitdate = date(VisitDate, "YMD")
drop VisitDate
rename tmp_visitdate VisitDate
format VisitDate %td

gen Encounter = 1

*Match on month rather than exact date: the referral is sometimes logged a few days apart from the ART visit
*Key is month*10000 + year
gen VisitMonth = 10000*month(VisitDate) + year(VisitDate)

*One row per patient-month: number of referrals and the first non-missing encounter type
collapse (sum) Encounter (firstnm) encountertype, by(PatientId VisitMonth)

save "Data/Prep/enc.dta", replace


use Data/Prep/FullData, clear

*Same month-year key as on the encounter file
gen VisitMonth = 10000*month(VisitDate) + year(VisitDate)

*Attach non-HIV referral "encounters"; encounter rows without a matching ART visit are discarded
merge m:1 PatientId VisitMonth using Data/Prep/enc, keep(master match) nogenerate

gen Death = (DeathDate!=.)
gen Male = (Sex==1)

*Age in years at the visit, in four bands (missing when age is unknown or outside 0-95)
gen Age = round((VisitDate-BirthDate)/365)
replace Age = . if Age<0 | Age>95
gen Age_09 = (Age<=9) if Age!=.
gen Age_1017 = (Age>=10 & Age<=17) if Age!=.
gen Age_1849 = (Age>=18 & Age<=49) if Age!=.
gen Age_50plus = (Age>=50) if Age!=.

*ART initiation flag and date
gen Init = (ART==1)
gen InitDate = VisitDate if ART==1
format InitDate %td

*Age, weight and height as recorded at initiation
gen AgeInit = Age if ART==1
gen WeightInit = Weight if ART==1
gen HeightInit = Height if ART==1

*Initiation rows sort first within patient, so their values are copied down onto later visits
gsort PatientId ART VisitDate
by PatientId: carryforward WeightInit AgeInit HeightInit InitDate, replace


*Body-mass index (kg/m^2) and underweight indicator
*Adults are everyone with Age>=18, consistent with the Age_1849 band and the AgeInit<18 test below.
*The previous Age>18 conditions left integer age 18 covered by neither the child cut-offs
*(which stop at 17) nor the adult cut-off, so 18-year-olds could never be coded underweight.
gen BMI=Weight/((Height/100)^2)
*Height is often missing after initiation so assume adults do not change height
replace BMI=Weight/((HeightInit/100)^2) if Age>=18 & Age!=.
gen RoundBMI=ceil(BMI)
*Treat implausible BMI values as missing
replace BMI=. if BMI<10
replace BMI=. if BMI>100

*https://www.who.int/toolkits/child-growth-standards/standards/body-mass-index-for-age-bmi-for-age
*https://www.who.int/tools/growth-reference-data-for-5to19-years/indicators/bmi-for-age
*Underweight: BMI z-score<-2
*Girls:
*0-1 14
*2-9 13
*10-12 14
*13-14 15
*15-17 16
*Boys:
*0-2 14
*3-8 13
*9-11 14
*12-13 15
*14-15 16
*16-17 17
*Adult: 18.5
gen Underweight=.
replace Underweight=0 if Age!=. & Male!=. & BMI!=.
*Girls
replace Underweight=1 if BMI<14 & Male==0 & Age>=0 & Age<=1
replace Underweight=1 if BMI<13 & Male==0 & Age>=2 & Age<=9
replace Underweight=1 if BMI<14 & Male==0 & Age>=10 & Age<=12
replace Underweight=1 if BMI<15 & Male==0 & Age>=13 & Age<=14
replace Underweight=1 if BMI<16 & Male==0 & Age>=15 & Age<=17

*Boys
replace Underweight=1 if BMI<14 & Male==1 & Age>=0 & Age<=2
replace Underweight=1 if BMI<13 & Male==1 & Age>=3 & Age<=8
replace Underweight=1 if BMI<14 & Male==1 & Age>=9 & Age<=11
replace Underweight=1 if BMI<15 & Male==1 & Age>=12 & Age<=13
replace Underweight=1 if BMI<16 & Male==1 & Age>=14 & Age<=15
replace Underweight=1 if BMI<17 & Male==1 & Age>=16 & Age<=17

*Adults (either sex)
replace Underweight=1 if BMI<18.5 & Age>=18 & Age!=.

*Rows that could not be classified (missing age, sex or BMI) are coded as not underweight
replace Underweight=0 if Underweight==.
*Adults who initiated as children: the initiation height is a child's height, so BMI is unreliable
replace Underweight=. if Age>=18 & AgeInit<18
*Note we don't usually have an accurate height measure for these adults

*Attach each clinic's EMR adoption date
merge m:1 VisitSite using Data/Prep/EMRDates
drop _merge

*Event time YearsPostEMR=0 in year immediately following EMR adoption date
*A visit on the adoption date itself (DaysPostEMR==0) falls in event-year -1
gen DaysPostEMR=VisitDate-EMRDate
gen YearsPostEMR = ceil(DaysPostEMR/365)-1
*Approx calendar year (to line up with event time)
gen Datey=year(EMRDate) + YearsPostEMR
gen EMRy=year(EMRDate)
gen Post=YearsPostEMR>-1


*Keep only last visit before EMR -- only a few clinics back-entered more visits than this
gsort VisitSite PatientId VisitDate
*RefDate is the visit date on refill rows only (missing on initiation rows)
gen RefDate=VisitDate if ART==2
by VisitSite: egen MinRefDate=min(RefDate)
by VisitSite: egen MaxRefDate=max(RefDate)
*LastRefPreDate = the patient's latest refill on or before the EMR date (missing if none)
gen RefPreDate=RefDate if VisitDate<=EMRDate
by VisitSite PatientId: egen LastRefPreDate=max(RefPreDate)
*Drops earlier pre-EMR refills; initiation rows have missing RefDate and are never dropped here
drop if RefDate<LastRefPreDate & LastRefPreDate!=.

*Construct defaults (missed appointment)
*A visit counts as a default when the next observed visit (or, for the patient's final row, the clinic's
*last observed refill date) falls more than 60 days after the dispensed supply would run out
gen DefDate=VisitDate+ARVDispensedDays+60
replace DefDate=VisitDate+180+60 if ARVDispensedDays>180
*Assume these are typos (unlikely to dispense more than 6 months of medication)
*NOTE(review): ARVDispensedDays>180 is also true when ARVDispensedDays is missing, so rows with a missing
*value (possible on initiation rows) get the capped 240-day window too -- confirm this is intended
format DefDate %td

gen Default=0
*Not the patient's last row: default if the next visit comes after DefDate
replace Default=1 if DefDate<VisitDate[_n+1] & PatientId==PatientId[_n+1]
*Patient's last row: default if DefDate falls within the clinic's observed refill period, unknown otherwise
replace Default=1 if PatientId!=PatientId[_n+1] & DefDate<=MaxRefDate
replace Default=. if PatientId!=PatientId[_n+1] & DefDate>MaxRefDate

replace Default=. if LastRefPreDate!=. & LastRefPreDate>DefDate 
*We do not observe visits between initiation/visit and last visit pre-EMR


//HIV Stage
*Advanced HIV: stage 3 or 4, or a CD4 count strictly between 0 and 200 (missing values count as non-advanced)
gen AdvHIV=(HIVStage==3)+(HIVStage==4)
replace AdvHIV=1 if (CD4Count>0) & (CD4Count<200)
tab AdvHIV

*Health measures at time of initiation
*Each temporary variable holds the value on initiation rows only; the patient-level max spreads it to every row
gen AHI=AdvHIV if ART==1
gen UI=Underweight if ART==1
gen BMII=BMI if ART==1
bysort PatientId: egen AdvHIVInit=max(AHI)
bysort PatientId: egen UnderweightInit=max(UI)
bysort PatientId: egen BMIInit=max(BMII)
drop UI AHI BMII

*Attach clinic characteristics
merge m:1 VisitSite using Data/Raw/SiteDetails
drop _merge

*Before restricting sample period, save the last prior visit date
gsort VisitSite PatientId VisitDate
by VisitSite PatientId: gen VisitNum=_n
tab VisitNum
*115 visits max
gen PreviousVisitDate=.
replace PreviousVisitDate=VisitDate[_n-1] if PatientId==PatientId[_n-1]
*Visits up to the last pre-EMR refill have unobserved predecessors, so their previous-visit date is unknown.
*The LastRefPreDate!=. guard is required: a missing value compares as larger than any date, so without it
*the condition is true for every patient with no pre-EMR refill and wipes all of their previous-visit dates.
replace PreviousVisitDate=. if VisitDate<=LastRefPreDate & LastRefPreDate!=.
format PreviousVisitDate %td

*Keep only visits during the sample period
merge m:1 VisitSite YearsPostEMR using Data/Prep/Sample_ClinicYears
keep if _merge==3
drop _merge

*ARV doses prescribed 
*Days dispensed, split by whether the row is an initiation or a refill
gen ARVInit=ARVDispensedDays if ART==1
gen ARVRef=ARVDispensedDays if ART==2

*Calculate clinic size in 2013, normalized
*Clinic size = number of initiations in the clinic's event-year whose approximate calendar year (Datey) is 2013
gsort VisitSite Datey
by VisitSite Datey: egen ClinicSize2013=sum(Init)
replace ClinicSize2013=. if Datey!=2013
*Keep the value on a single row per clinic so the mean and SD below weight each clinic once
by VisitSite Datey: gen tempid=_n
replace ClinicSize2013=. if tempid!=1
drop tempid
*Mean and SD across clinics
egen MeanCS=mean(ClinicSize2013)
egen SDCS=sd(ClinicSize2013)
gen StdClinicSize2013=(ClinicSize2013-MeanCS)/SDCS
*Spread the single per-clinic value back to every row of that clinic
bysort VisitSite: egen ClinicSize_z=min(StdClinicSize2013)
bysort VisitSite: egen ClinicSize=min(ClinicSize2013)
drop StdClinicSize MeanCS SDCS ClinicSize2013

*Above-average clinic size
*NOTE(review): a missing ClinicSize_z also evaluates as >0 and would be coded large=1 -- confirm none are missing
gen large=ClinicSize_z>0

gsort VisitSite PatientId VisitDate

*TB treatment
*presumably 7458 is the EMR concept code for being on TB treatment -- TODO confirm against the codebook
gen TBT=(TbStatus==7458)

*Reclassify returning/transfer in patients
*presumably EverART 1065/1066 are the EMR yes/no codes -- TODO confirm against the codebook
gen RefVisit=(ART==2)
gen Return=(ART==2)
replace Return=1 if Init==1 & EverART==1065
gen Unclassified=Init*(EverART!=1066)*(EverART!=1065)
*NOTE(review): the next line also sets Unclassified=1 for initiations with EverART==1065, so Unclassified ends
*up flagging every initiation whose EverART is not 1066 -- confirm this is intended
replace Unclassified=1 if Init==1 & EverART==1065
replace Return=1 if Unclassified==1
*New = initiation with EverART==1066
gen New=Init
replace New=0 if EverART!=1066

gen InitMonth=month(InitDate)
gen InitYear=year(InitDate)

*Viral suppression is defined on the raw viral load (<200) before the variable is rescaled to thousands
gen ViralSuppressed=0 if ViralLoad!=.
replace ViralSuppressed=1 if ViralLoad<200
replace ViralLoad=ViralLoad/1000

gen Female=1-Male
*Time since initiation at this visit
gen MonthsInit=(VisitDate-InitDate)*12/365
gen YearsInit=MonthsInit/12

*Visits with no matching referral record count as zero encounters
replace Encounter=0 if Encounter==.

save Data/Analysis/Data_Individual, replace


*Initiation rows only
keep if Init==1

save Data/Analysis/Data_IndividualInit, replace


********************************************************************************************
*PART 6: Produce individual-year-level data for analysis
*Data/Analysis/Data_IndividualYears
********************************************************************************************
*Builds a patient-by-event-year panel: collapse visits to patient-years, fill in the years with no
*visit, then derive in-care, age, time-since-last-visit and lapse measures for every patient-year


use Data/Analysis/Data_Individual, clear

gsort PatientId VisitDate

*Make sure we see the full trajectory for every patient
*(the sample-period restriction may have removed a patient's initiation row)
by PatientId: egen minART=min(ART)
drop if minART!=1

*Both start as the visit date; the collapse below takes the max of LastVisitEventYear within patient-year
*(FirstVisitEventYear is not carried through the collapse)
gen FirstVisitEventYear=VisitDate
gen LastVisitEventYear=VisitDate
format FirstVisitEventYear %td
format LastVisitEventYear %td
format LastRefPreDate %td

gen FirstVisitAfterInit=VisitDate if Init==0

*Values recorded on the initiation row only
gen TBTInit=TBT if Init==1
gen EncounterInit=Encounter if Init==1

gen CD4CountInit=CD4Count if Init==1
replace CD4CountInit=. if CD4Count==0

*One row per patient and event-year (VisitSite is constant within patient)
gcollapse (sum) RefVisit (firstnm) Male InitDate BirthDate EMRDate LastRefPreDate Datey ClinicSize ClinicSize_z ARVInit CD4CountInit TBTInit EncounterInit UnderweightInit AdvHIVInit (max) Return Init New Unclass LastVisitEventYear (min) FirstVisitAfterInit (lastnm) Underweight Default ViralLoad TBT Encounter, by(PatientId YearsPostEMR VisitSite)

gsort PatientId YearsPostEMR
xtset PatientId YearsPostEMR

*Create a row for every patient in every event-year of the panel, including years with no visit
tsfill, full

*Checkpoint: the panel is written to a temporary file and reloaded
save Data/Temp/temp, replace
use Data/Temp/temp, clear

*Missing VisitSite sorts last, so each patient's observed row comes first and its time-invariant
*values are copied onto the filled-in rows
gsort PatientId VisitSite 
by PatientId: carryforward Male InitDate BirthDate VisitSite EMRDate LastRefPreDate ClinicSize ClinicSize_z ARVInit CD4CountInit TBTInit EncounterInit UnderweightInit AdvHIVInit, replace

*Recompute the approximate calendar year for every row, including the filled-in ones
replace Datey=year(EMRDate) + YearsPostEMR

by PatientId: carryforward InitDate, replace

gsort VisitSite YearsPost
*What is the last open day of the event-year for each clinic?
by VisitSite YearsPost: egen LastDayEventYear=max(LastVisitEventYear)
format LastDayEventYear %td

*Drop clinic-years with no visits at all, and patient-years before the patient initiated
drop if LastDayEventYear==.
drop if InitDate>LastDayEventYear

gsort VisitSite PatientId YearsPost

*Filled-in years have no visits: counts and indicators become zero
foreach x of varlist Init RefVisit New Return Unclass TBT* Encounter* {
	replace `x'=0 if `x'==.
}

gen RegisteredPatient=1
*InCare: initiated or had at least one refill visit during the event-year
gen InCare=Init
replace InCare=1 if RefVisit>0
*InCareRef: in care in a year other than the initiation year
gen InCareRef=InCare
replace InCareRef=0 if Init==1

*Visit-based measures are treated as unobserved earlier than event-year -1
foreach x of varlist InCare InCareRef LastVisitEventYear Underweight Default ViralLoad TBT Encounter {
	replace `x'=. if YearsPost<-1
}
replace RefVisit=. if YearsPost<0
*We don't observe all visits before this

*How old would that patient be at the end of the event-time year?
gen Age=round((LastDayEventYear-BirthDate)/365)
replace Age=. if Age<0 | Age>95
gen Age_09=Age<=9
replace Age_09=. if Age==.
gen Age_1017=Age<=17 & Age>=10
replace Age_1017=. if Age==.
gen Age_1849=Age<=49 & Age>=18
replace Age_1849=. if Age==.
gen Age_50plus=Age>=50
replace Age_50plus=. if Age==.

*When was the patient's last visit as of end of the event year?
gen LastVisit=LastVisitEventYear
format LastVisit %td
*Event time -1
*No visit in year -1: fall back to the last pre-EMR refill, then to the initiation date
replace LastVisit=LastRefPreDate if YearsPost==-1 & LastVisit==.
replace LastVisit=InitDate if YearsPost==-1 & LastVisit==.
*Event time 0
*Years with no visit inherit the previous row's LastVisit, one event-year at a time
*NOTE(review): the loop stops at event-year 10 and the lag does not check that the previous row belongs
*to the same patient -- confirm neither case occurs in the data
forvalues i=0/10 {
	replace LastVisit=LastVisit[_n-1] if YearsPost==`i' & LastVisit==.
}

*Whole months between the patient's last visit / initiation and the end of the event-year
gen MonthsSinceLastVisit=floor((LastDayEventYear-LastVisit)*12/365)
tab MonthsSince
gen MonthsInit=floor((LastDayEventYear-InitDate)*12/365)
gen YearsInit=MonthsInit/12
gen Inity=year(InitDate)

*This rises over time, naturally, as people drop out of care and die
*Rate is 2 months per year on average
reg MonthsSince YearsPost

*Define lapse indicators: no visit for at least 6, 12, 18, 24, 30 or 36 months
forvalues i = 6 12 : 36 {
	gen Lapse`i'=(MonthsSince>=`i')
	replace Lapse`i'=. if MonthsSince==.
}

gen Post=YearsPost>=0

gen Female=1-Male


*Attach clinic characteristics
merge m:1 VisitSite using Data/Raw/SiteDetails
drop _merge

*NOTE(review): a missing ClinicSize_z also evaluates as >0 and would be coded large=1
gen large=ClinicSize_z>0


save Data/Analysis/Data_IndividualYears, replace

********************************************************************************************
*PART 7: Produce clinic-year-level data for analysis
*Data/Analysis/Data_ClinicYears
********************************************************************************************


****Yearly staff data
use Data/Analysis/Data_IndividualInit, clear

keep PatientId VisitSite EMRDate

*Attach the staff records for each patient; staff rows for patients outside this file are discarded
merge 1:m PatientId using Data/Raw/Staff.dta, keep(master match) nogenerate

*Event-year of each staff record
gen DaysPostEMR = VisitDate - EMRDate
gen YearsPostEMR = ceil(DaysPostEMR/365) - 1

keep staff_reg PatientId VisitDate VisitSite YearsPostEMR
drop if staff_reg==.

*First collapse leaves one row per distinct staff_reg value per clinic-year; the second counts those rows
gcollapse (count) PatientId, by(staff_reg YearsPostEMR VisitSite)
gen num_staff_reg = 1
gcollapse (sum) num_staff_reg, by(YearsPostEMR VisitSite)


save Data/Prep/Staff_ClinicYears, replace


use Data/Analysis/Data_IndividualYears, clear

*Returning patients are those who did not start treatment that year;
*unclassified patients are assumed to be returning rather than new
replace Return = 1 if Unclassified==1
replace Return = 0 if New==1
tab New if InCare==1
tab Return if InCare==1

gen Reg = RegisteredPatient

gsort PatientId YearsPostEMR

*Sex- and age-specific versions of each patient-level indicator
foreach grp of varlist Male Female Age_* {
	foreach stat in InCare Return New Reg {
		gen `stat'`grp' = `stat'*`grp'
	}
}

*Initiation characteristics are counted (or averaged) only in the year the patient initiated;
*these are used for extra pre-trend checks
foreach v of varlist TBTInit UnderweightInit AdvHIVInit {
	replace `v' = 0 if Init!=1
}
foreach v of varlist CD4CountInit ARVInit {
	replace `v' = . if Init!=1
}

*Collapse to one row per clinic and event-year
gcollapse (sum) RefVisit TBT InCare* Return* New* Reg* Init TBTInit UnderweightInit AdvHIVInit ///
	(mean) CD4CountInit ARVInit, ///
	by(YearsPostEMR EMRDate Datey VisitSite large ClinicSize ClinicSize_z)

gen EMRyear = year(EMRDate)
gen Post = (YearsPostEMR>=0)

*Attach death and staff counts; clinic-years that appear only in those files are discarded
foreach src in Deaths_ClinicYears Staff_ClinicYears {
	merge 1:1 VisitSite YearsPostEMR using Data/Prep/`src', keep(master match) nogenerate
}


gsort VisitSite YearsPostEMR

*Missing counts become zero
foreach v of varlist Death* InCare* Return* New* Init num_staff_reg TBTInit UnderweightInit AdvHIVInit {
	replace `v' = 0 if `v'==.
}
*Clinic-years with no initiations are removed
drop if Init==0

*In-care and returning counts are treated as unobserved earlier than event-year -1
foreach v of varlist InCare* Return* {
	replace `v' = . if YearsPostEMR<-1
}

*Three transformations of each count: inverse hyperbolic sine, log(x+1) and log(x+0.01)
foreach v of varlist Death* InCare* Return* New* TBT num_staff_reg TBTInit UnderweightInit AdvHIVInit {
	gen ihst`v' = ln(`v' + sqrt(`v'^2 + 1))
	gen log`v' = ln(`v'+1)
	gen altlog`v' = ln(`v'+0.01)
}

*Attach clinic characteristics and the clinic-level EMR date file
merge m:1 VisitSite using Data/Raw/SiteDetails, nogenerate
 
merge m:1 VisitSite EMRDate using Data/Prep/EMRDates, nogenerate

save Data/Analysis/Data_ClinicYears, replace 

********************************************************************************************
*PART 8: Produce quarter-level data for analysis

********************************************************************************************

****Quarterly staff data
use Data/Analysis/Data_IndividualInit, clear

keep PatientId VisitSite EMRDate

*Attach the staff records for each patient; staff rows for patients outside this file are discarded
merge 1:m PatientId using Data/Raw/Staff.dta, keep(master match) nogenerate

*Event-quarter of each staff record
gen DaysPostEMR = VisitDate - EMRDate
gen QPostEMR = ceil(DaysPostEMR/91.25) - 1

keep staff_* PatientId VisitDate VisitSite QPostEMR

*Number of distinct staff_reg values per clinic-quarter, written to the temp file
preserve
drop if staff_reg==.
gcollapse (count) PatientId, by(staff_reg QPostEMR VisitSite)
gen num_staff_reg = 1
gcollapse (sum) num_staff_reg, by(QPostEMR VisitSite)
save Data/Temp/temp, replace
restore

*Number of distinct staff_treat values per clinic-quarter, from event-quarter -1 onwards
keep if staff_treat!=. & QPostEMR>=-1
gcollapse (count) PatientId, by(staff_treat QPostEMR VisitSite)
gen num_staff_treat = 1
gcollapse (sum) num_staff_treat, by(QPostEMR VisitSite)

*Combine the two counts; clinic-quarters with staff_treat records but no staff_reg records get zero
merge 1:1 QPostEMR VisitSite using Data/Temp/temp
replace num_staff_reg = 0 if _merge==1
drop _merge

save Data/Prep/Staff_ClinicQuarters, replace


use Data/Analysis/Data_Individual, clear

*Event time in quarters and calendar quarter of each visit
replace DaysPostEMR = VisitDate - EMRDate
gen QPostEMR = ceil(DaysPostEMR/91.25) - 1
gen Dateq = quarter(VisitDate)

*Side calculation: classify every clinic-day by the kind of visits it had, then count days per clinic-quarter
preserve

gen Refills = (ART==2)
replace Init = 0 if Init==.

collapse (sum) Refills Init, by(VisitDate QPostEMR VisitSite)

gen RefillOnlyDay = (Refills>0 & Init==0)
gen InitOnlyDay = (Init>0 & Refills==0)
gen RefillInitDay = (Refills>0 & Init>0)

collapse (sum) RefillOnlyDay InitOnlyDay RefillInitDay, by(QPostEMR VisitSite)
save Data/Temp/temp, replace

restore

*Visits are counted from event-quarter 0 onwards only
gen Visits = 1 if !(QPostEMR<0)

*First to one row per patient-quarter ...
gcollapse (sum) Init New Return Unclass Visits (lastnm) TBT Encounter Underweight (firstnm) Male BirthDate InitDate (min) Dateq Datey EMRDate, by(PatientId QPostEMR VisitSite)

*... each of which is one patient in care that quarter ...
generate InCare = 1

*... then to one row per clinic-quarter
gcollapse (sum) TBT Encounter Underweight Init New Return Unclass InCare Visits (min) Dateq Datey EMRDate, by(QPostEMR VisitSite)

*Attach visit-day, death and staff counts; clinic-quarters that appear only in those files are discarded
foreach src in Data/Temp/temp Data/Prep/Deaths_ClinicQuarters Data/Prep/Staff_ClinicQuarters {
	merge 1:1 VisitSite QPostEMR using `src', keep(master match) nogenerate
}

*Missing counts become zero
foreach v of varlist InCare* Return* New* Death* TBT Encounter Underweight {
	replace `v' = 0 if `v'==.
}

*These measures are treated as unobserved earlier than event-quarter -1
foreach v of varlist InCare* Return* TBT Encounter Underweight RefillOnlyDay RefillInitDay InitOnlyDay num_staff_treat {
	replace `v' = . if QPostEMR<-1
}

*Three transformations of each count: inverse hyperbolic sine, log(x+1) and log(x+0.01)
foreach v of varlist Death* InCare* Return* New* TBT Encounter Underweight Visits num_staff_reg num_staff_treat {
	gen ihst`v' = ln(`v' + sqrt(`v'^2 + 1))
	gen log`v' = ln(`v'+1)
	gen altlog`v' = ln(`v'+0.01)
}

*Days with any visit, any refill, and any initiation
gen VisitDay = RefillOnlyDay + RefillInitDay + InitOnlyDay
gen RefillDay = RefillOnlyDay + RefillInitDay
gen InitDay = InitOnlyDay + RefillInitDay
save Data/Analysis/Data_ClinicQuarters, replace
***********************************

*Individual-quarter panel (Data/Analysis/Data_IndividualQuarters): same construction as the
*individual-year panel, with event time measured in quarters and limited to quarters -12 to 12
use Data/Analysis/Data_Individual, clear

gsort PatientId VisitDate

*Make sure we see the full trajectory for every patient
by PatientId: egen minART=min(ART)
drop if minART!=1

*Event time in quarters (DaysPostEMR is already stored in Data_Individual)
gen QPostEMR = ceil(DaysPostEMR/91.25)-1

keep if QPostEMR>=-12 & QPostEMR<=12

gsort PatientId VisitDate

*Starts as the visit date; the collapse below takes the max within patient-quarter
gen LastVisitEventQ=VisitDate
format LastVisitEventQ %td

*One row per patient and event-quarter
gcollapse (sum) RefVisit Init New Return Unclass Death (firstnm) Male BirthDate InitDate (max) LastVisitEventQ, by(PatientId QPostEMR VisitSite EMRDate LastRefPreDate)

gsort PatientId QPostEMR
xtset PatientId QPostEMR

*Create a row for every patient in every event-quarter of the panel, including quarters with no visit
tsfill, full

*Missing VisitSite sorts last, so each patient's observed row comes first and its values are
*copied onto the filled-in rows
gsort PatientId VisitSite
by PatientId: carryforward VisitSite Male BirthDate InitDate EMRDate LastRefPreDate Death, replace

*Checkpoint: the panel is written to a temporary file and reloaded
save Data/Temp/temp, replace
use Data/Temp/temp, clear

gsort VisitSite QPost
*Last day with any visit at the clinic in each event-quarter
by VisitSite QPost: egen LastDayEventQ=max(LastVisitEventQ)
format LastDayEventQ %td
format LastRefPreDate %td

*Drop clinic-quarters with no visits at all, and patient-quarters before the patient initiated
drop if LastDayEventQ==.
drop if InitDate>LastDayEventQ

gsort VisitSite PatientId QPost

*Filled-in quarters have no visits: counts and indicators become zero
foreach x of varlist Init RefVisit New Return Unclass {
	replace `x'=0 if `x'==.
}

gen RegisteredPatient=1
*InCare: initiated or had at least one refill visit during the event-quarter
gen InCare=Init
replace InCare=1 if RefVisit>0
*InCareRef: in care in a quarter other than the initiation quarter
gen InCareRef=InCare
replace InCareRef=0 if Init==1

*Visit-based measures are treated as unobserved earlier than event-quarter -1
foreach x of varlist RefVisit InCare InCareRef LastVisitEventQ{
	replace `x'=. if QPost<-1
}

*How old would that patient be at the end of the event-time quarter?
gen Age=round((LastDayEventQ-BirthDate)/365)
replace Age=. if Age<0 | Age>95
gen Age_09=Age<=9
replace Age_09=. if Age==.
gen Age_1017=Age<=17 & Age>=10
replace Age_1017=. if Age==.
gen Age_1849=Age<=49 & Age>=18
replace Age_1849=. if Age==.
gen Age_50plus=Age>=50
replace Age_50plus=. if Age==.

*When was the patient's last visit as of end of the event quarter?
gen LastVisit=LastVisitEventQ
format LastVisit %td
*Event time -1
*No visit in quarter -1: fall back to the last pre-EMR refill, then to the initiation date
replace LastVisit=LastRefPreDate if QPost==-1 & LastVisit==.
replace LastVisit=InitDate if QPost==-1 & LastVisit==.
*Event time 0
*Quarters with no visit inherit the previous row's LastVisit, one event-quarter at a time
*NOTE(review): the lag does not check that the previous row belongs to the same patient -- confirm
*a patient's first row never has a missing LastVisit
forvalues i=0/12 {
	replace LastVisit=LastVisit[_n-1] if QPost==`i' & LastVisit==.
}

*Whole months between the patient's last visit / initiation and the end of the event-quarter
gen MonthsSinceLastVisit=floor((LastDayEventQ-LastVisit)*12/365)
tab MonthsSince
gen MonthsInit=floor((LastDayEventQ-InitDate)*12/365)
gen YearsInit=MonthsInit/12
gen Inity=year(InitDate)

gen Post=QPost>=0

*Define lapse indicators: no visit for at least 6, 12, 18, 24, 30 or 36 months
forvalues i = 6 12 : 36 {
	gen Lapse`i'=(MonthsSince>=`i')
	replace Lapse`i'=. if MonthsSince==.
}


save Data/Analysis/Data_IndividualQuarters, replace


*Clinic-level file for mapping: site details joined with EMR adoption date and adoption year
use Data/Raw/SiteDetails, clear

merge 1:1 VisitSite using Data/Prep/EMRDates
drop _merge
gen EMRy=year(EMRDate)

*replace is needed so the script can be re-run once MapDates.dta already exists
*(every other save in this file already uses it)
save Data/Prep/MapDates, replace
